Sentiment Analysis on Movie Reviews using an LSTM RNN Model

Each phrase in the dataset is labelled with one of five sentiment classes (a small decoding helper is sketched after this list):

  • 0 - negative

  • 1 - somewhat negative

  • 2 - neutral

  • 3 - somewhat positive

  • 4 - positive
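The notebook works with the raw integer classes throughout; the dictionary below is only an illustrative helper (the name label_names is invented here and not used elsewhere) for turning predicted class indices back into readable names.

# Illustrative helper only: decode integer sentiment classes into names.
label_names = {
    0: 'negative',
    1: 'somewhat negative',
    2: 'neutral',
    3: 'somewhat positive',
    4: 'positive',
}

print([label_names[c] for c in [2, 3, 0]])   # ['neutral', 'somewhat positive', 'negative']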

Load Libraries


In [30]:
import numpy as np
import pandas as pd

from gensim import corpora
from nltk.corpus import stopwords        # requires nltk.download('stopwords')
from nltk.tokenize import word_tokenize  # requires nltk.download('punkt')
from nltk.stem import SnowballStemmer

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

Load and Read Datasets


In [2]:
train = pd.read_csv('train.tsv', sep='\t', header=0)
test = pd.read_csv('test.tsv', sep='\t', header=0)

In [3]:
train.shape, test.shape


Out[3]:
((156060, 4), (66292, 3))

In [4]:
train.head()


Out[4]:
   PhraseId  SentenceId                                             Phrase  Sentiment
0         1           1  A series of escapades demonstrating the adage ...          1
1         2           1  A series of escapades demonstrating the adage ...          2
2         3           1                                           A series          2
3         4           1                                                  A          2
4         5           1                                             series          2

In [5]:
test.head()


Out[5]:
   PhraseId  SentenceId                                              Phrase
0    156061        8545  An intermittently pleasing but mostly routine ...
1    156062        8545  An intermittently pleasing but mostly routine ...
2    156063        8545                                                 An
3    156064        8545  intermittently pleasing but mostly routine effort
4    156065        8545         intermittently pleasing but mostly routine

In [6]:
raw_docs_train = train['Phrase'].values
raw_docs_test = test['Phrase'].values
sentiment_train = train['Sentiment'].values
num_labels = len(np.unique(sentiment_train))

In [7]:
np.unique(sentiment_train)


Out[7]:
array([0, 1, 2, 3, 4])

Preprocessing Data


In [8]:
stop_words = set(stopwords.words('english'))
print (stop_words)


set([u'all', u'just', u'being', u'over', u'both', u'through', u'yourselves', u'its', u'before', u'o', u'hadn', u'herself', u'll', u'had', u'should', u'to', u'only', u'won', u'under', u'ours', u'has', u'do', u'them', u'his', u'very', u'they', u'not', u'during', u'now', u'him', u'nor', u'd', u'did', u'didn', u'this', u'she', u'each', u'further', u'where', u'few', u'because', u'doing', u'some', u'hasn', u'are', u'our', u'ourselves', u'out', u'what', u'for', u'while', u're', u'does', u'above', u'between', u'mustn', u't', u'be', u'we', u'who', u'were', u'here', u'shouldn', u'hers', u'by', u'on', u'about', u'couldn', u'of', u'against', u's', u'isn', u'or', u'own', u'into', u'yourself', u'down', u'mightn', u'wasn', u'your', u'from', u'her', u'their', u'aren', u'there', u'been', u'whom', u'too', u'wouldn', u'themselves', u'weren', u'was', u'until', u'more', u'himself', u'that', u'but', u'don', u'with', u'than', u'those', u'he', u'me', u'myself', u'ma', u'these', u'up', u'will', u'below', u'ain', u'can', u'theirs', u'my', u'and', u've', u'then', u'is', u'am', u'it', u'doesn', u'an', u'as', u'itself', u'at', u'have', u'in', u'any', u'if', u'again', u'no', u'when', u'same', u'how', u'other', u'which', u'you', u'shan', u'needn', u'haven', u'after', u'most', u'such', u'why', u'a', u'off', u'i', u'm', u'yours', u'so', u'y', u'the', u'having', u'once'])

In [9]:
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
print (stop_words)


set([u'all', u'just', u'being', u'over', u'both', u'through', u'yourselves', u'its', u'before', u'o', u'hadn', u'herself', u'll', u'had', ',', u'should', u'to', u'only', u'won', u'under', u'ours', u'has', u'do', u'them', u'his', u'very', u'they', u'not', u'during', u'now', u'him', u'nor', u'd', u'did', u'didn', u'this', u'she', u'each', u'further', u'where', u'few', u'because', u'doing', u'some', u'hasn', u'are', u'our', u'ourselves', u'out', u'what', u'for', u'while', u're', u'does', u'above', u'between', u'mustn', u't', u'be', u'we', u'who', u'were', u'here', u'shouldn', u'hers', '[', u'by', u'on', u'about', u'couldn', u'of', u'against', u's', u'isn', '(', '{', u'or', u'own', u'into', u'yourself', u'down', u'mightn', u'wasn', u'your', '"', u'from', u'her', u'their', u'aren', u'there', u'been', '.', u'whom', u'too', u'wouldn', u'themselves', u'weren', u'was', u'until', u'more', u'himself', u'that', u'but', ';', u'don', u'with', u'than', u'those', u'he', u'me', u'myself', ':', u'ma', u'these', u'up', u'will', u'below', u'ain', u'can', u'theirs', u'my', u'and', u've', u'then', u'is', u'am', u'it', u'doesn', u'an', u'as', u'itself', u'at', u'have', u'in', u'any', u'if', u'again', u'no', ')', u'when', u'same', u'how', u'other', u'which', u'you', u'shan', u'needn', u'haven', u'after', u'most', u'such', ']', u'why', u'a', u'off', "'", u'i', u'm', u'yours', u'so', u'y', u'the', '}', u'having', u'once'])

In [10]:
stemmer = SnowballStemmer('english')

In [11]:
print "pre-processing train docs..."
processed_docs_train = []
for index, doc in enumerate(raw_docs_train):
    tokens = word_tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    stemmed = [stemmer.stem(word) for word in filtered]
    processed_docs_train.append(stemmed)
    
    if index == 0:
        print ('\n')
        print (doc)
        print ('\n')
        print (tokens)
        print ('\n')
        print (filtered)
        print ('\n')
        print (stemmed)


pre-processing train docs...


A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .


['A', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.']


['A', 'series', 'escapades', 'demonstrating', 'adage', 'good', 'goose', 'also', 'good', 'gander', 'occasionally', 'amuses', 'none', 'amounts', 'much', 'story']


['a', u'seri', u'escapad', u'demonstr', u'adag', u'good', u'goos', u'also', u'good', u'gander', u'occasion', u'amus', u'none', u'amount', u'much', u'stori']

In [12]:
print "pre-processing test docs..."
processed_docs_test = []
for doc in raw_docs_test:
    tokens = word_tokenize(doc)
    filtered = [word for word in tokens if word not in stop_words]
    stemmed = [stemmer.stem(word) for word in filtered]
    processed_docs_test.append(stemmed)


pre-processing test docs...

In [13]:
# build the token dictionary over the train and test phrases together
processed_docs_all = np.concatenate((processed_docs_train, processed_docs_test), axis=0)

In [14]:
dictionary = corpora.Dictionary(processed_docs_all)
dictionary_size = len(dictionary.keys())
print "dictionary size: ", dictionary_size


dictionary size:  13759

In [15]:
dictionary[0], dictionary[14]


Out[15]:
(u'a', u'seri')

In [16]:
print "converting to token ids..."
word_id_train, word_id_len = [], []
for index,doc in enumerate(processed_docs_train):
    word_ids = [dictionary.token2id[word] for word in doc]
    word_id_train.append(word_ids)
    word_id_len.append(len(word_ids))
    
    if index == 0:
        print (doc)
        print (word_ids)
        print (word_id_train)
        print (word_id_len)


converting to token ids...
['a', u'seri', u'escapad', u'demonstr', u'adag', u'good', u'goos', u'also', u'good', u'gander', u'occasion', u'amus', u'none', u'amount', u'much', u'stori']
[0, 14, 12, 13, 11, 2, 9, 6, 2, 5, 10, 4, 1, 3, 8, 7]
[[0, 14, 12, 13, 11, 2, 9, 6, 2, 5, 10, 4, 1, 3, 8, 7]]
[16]

In [17]:
word_id_test, word_ids = [], []
for doc in processed_docs_test:
    word_ids = [dictionary.token2id[word] for word in doc]
    word_id_test.append(word_ids)
    word_id_len.append(len(word_ids))

In [18]:
# cap the padded sequence length at mean + 2 standard deviations of the phrase lengths
seq_len = np.round((np.mean(word_id_len) + 2*np.std(word_id_len))).astype(int)
print (np.mean(word_id_len))
print (np.std(word_id_len))
print (seq_len)


4.16991976686
3.80478385787
12
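In other words, the cutoff is round(4.1699 + 2 × 3.8048) = round(11.78) = 12 tokens; assuming phrase lengths are roughly normally distributed, padding/truncating to this length keeps about 95% of the phrases intact.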

Padding Sequences


In [19]:
#pad sequences
word_id_train = sequence.pad_sequences(np.array(word_id_train), maxlen=seq_len)
word_id_test = sequence.pad_sequences(np.array(word_id_test), maxlen=seq_len)
y_train_enc = np_utils.to_categorical(sentiment_train, num_labels)

In [20]:
print (word_id_train)


[[   11     2     9 ...,     3     8     7]
 [    0     0     0 ...,    11     2     9]
 [    0     0     0 ...,     0     0    14]
 ..., 
 [    0     0     0 ...,     0 11849 11850]
 [    0     0     0 ...,     0     0 11849]
 [    0     0     0 ...,     0     0 11850]]

In [21]:
print (y_train_enc)


[[ 0.  1.  0.  0.  0.]
 [ 0.  0.  1.  0.  0.]
 [ 0.  0.  1.  0.  0.]
 ..., 
 [ 0.  0.  0.  1.  0.]
 [ 0.  0.  1.  0.  0.]
 [ 0.  0.  1.  0.  0.]]

Training LSTM RNN Model

Long short-term memory (LSTM) is a recurrent neural network (RNN) architecture designed to remember values over arbitrary time intervals. Unlike a feedforward network, an RNN has feedback (recurrent) connections, so its output at each time step depends on the current input as well as on the state carried over from previous steps.

An LSTM network contains LSTM units instead of, or in addition to, other network units. An LSTM unit can retain information over long or short time spans. The key to this ability is its cell state: updates to the cell state are regulated by input and forget gates (with an output gate controlling what is exposed to the rest of the network) and are largely additive rather than being repeatedly squashed through an activation function, so the gradient flowing along the cell state does not tend to vanish when the network is trained with backpropagation through time.

keras.layers.recurrent.LSTM(units, activation='tanh', recurrent_activation='hard_sigmoid', use_bias=True, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros', unit_forget_bias=True, kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, activity_regularizer=None, kernel_constraint=None, recurrent_constraint=None, bias_constraint=None, dropout=0.0, recurrent_dropout=0.0)

units: Positive integer, dimensionality of the output space.

dropout: Float between 0 and 1. Fraction of the units to drop for the linear transformation of the inputs.

recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for the linear transformation of the recurrent state.

Source: https://keras.io/layers/recurrent/#lstm
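As a minimal sketch of how these arguments fit together (the layer sizes here are illustrative, not the ones trained below):

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Illustrative values only: a 64-unit LSTM with 20% dropout on the input
# transformation and 20% dropout on the recurrent transformation.
sketch = Sequential()
sketch.add(Embedding(input_dim=10000, output_dim=128, input_length=12))
sketch.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))
sketch.add(Dense(5, activation='softmax'))
sketch.compile(loss='categorical_crossentropy', optimizer='adam')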

Embedding Layer

An Embedding layer is useful for two main reasons:

  • One-hot encoded vectors are high-dimensional and sparse. For example, in a Natural Language Processing (NLP) task with a dictionary of 2,000 words, one-hot encoding represents each word as a vector of 2,000 integers, 1,999 of which are zeros. On a large dataset this is not computationally efficient.
  • The vector of each embedding is updated while the neural network is trained. This makes it possible to explore relationships, not only between words but between anything that can be turned into a vector through an embedding layer.

keras.layers.embeddings.Embedding(input_dim, output_dim, embeddings_initializer='uniform', embeddings_regularizer=None, activity_regularizer=None, embeddings_constraint=None, mask_zero=False, input_length=None)

Turns positive integers (indexes) into dense vectors of fixed size, e.g. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]].

This layer can only be used as the first layer in a model.

Source: https://keras.io/layers/embeddings/

Example:

model.add(Embedding(1000, 64, input_length=10))

In the above example code, the model will take as input an integer matrix of size (batch, input_length). The largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size). Now model.output_shape == (None, 10, 64), where None is the batch dimension.
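The output shape claim is easy to check with a standalone sketch (separate from the sentiment model built below), mirroring the example from the Keras documentation:

import numpy as np
from keras.models import Sequential
from keras.layers import Embedding

demo = Sequential()
demo.add(Embedding(1000, 64, input_length=10))   # vocabulary of 1000, 64-dim vectors
demo.compile('rmsprop', 'mse')

# a batch of 32 sequences of length 10, with word indices in [0, 1000)
batch = np.random.randint(1000, size=(32, 10))
print(demo.predict(batch).shape)                 # (32, 10, 64)
print(demo.output_shape)                         # (None, 10, 64)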


In [27]:
#LSTM
print "fitting LSTM ..."
model = Sequential()
model.add(Embedding(dictionary_size, 128))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(num_labels))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


fitting LSTM ...

In [28]:
model.fit(word_id_train, y_train_enc, epochs=3, batch_size=256, verbose=1)


Epoch 1/3
156060/156060 [==============================] - 422s - loss: 0.9993 - acc: 0.5981   
Epoch 2/3
156060/156060 [==============================] - 450s - loss: 0.8258 - acc: 0.6625   
Epoch 3/3
156060/156060 [==============================] - 380s - loss: 0.7732 - acc: 0.6798   
Out[28]:
<keras.callbacks.History at 0x7efe5d259b50>

Using Convolutional Neural Network (CNN) + LSTM

We add a one-dimensional convolutional layer Conv1D() and a max-pooling layer MaxPooling1D() after the Embedding layer, and their output features are fed to the LSTM. The convolution uses 32 filters with a kernel size of 3, and the pooling layer uses the standard pool size of 2 to halve the feature-map size.


In [31]:
# CNN + LSTM
print("fitting LSTM ...")
model = Sequential()
model.add(Embedding(dictionary_size, 128))

model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))

# sigmoid activation for binary classification
# softmax activation for multi-class classification
model.add(Dense(num_labels, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


fitting LSTM ...

In [32]:
model.fit(word_id_train, y_train_enc, epochs=3, batch_size=256, verbose=1)


Epoch 1/3
156060/156060 [==============================] - 228s - loss: 1.0164 - acc: 0.5926   
Epoch 2/3
156060/156060 [==============================] - 242s - loss: 0.8160 - acc: 0.6635   
Epoch 3/3
156060/156060 [==============================] - 219s - loss: 0.7515 - acc: 0.6867   
Out[32]:
<keras.callbacks.History at 0x7efe9d4f5990>

Creating Submission


In [33]:
test_pred = model.predict_classes(word_id_test)


66292/66292 [==============================] - 61s    

In [36]:
test_pred


Out[36]:
array([2, 2, 2, ..., 1, 1, 2])

In [37]:
# make a submission file with the PhraseId and Sentiment columns
test['Sentiment'] = test_pred.reshape(-1,1)
header = ['PhraseId', 'Sentiment']
test.to_csv('./submission_lstm_cnn.csv', columns=header, index=False, header=True)